// Extract text from an HTML file with Apache Tika.
package docx;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.ContentHandler;

/**
 * Command-line utility that parses a document with Apache Tika and prints the
 * text collected by a {@link BodyContentHandler} to standard output.
 *
 * <p>The file to parse is taken from the first command-line argument; when no
 * argument is supplied, {@link #DEFAULT_PATH} is used.
 */
public class extractTextTika {
	/** Document parsed when no path is given on the command line. */
	private static final String DEFAULT_PATH =
			"C:/Users/nnn/Documents/eclipse/swp/ttt/sample-docs/docx/doc.html";

	/**
	 * Parses the selected file and prints its extracted text.
	 *
	 * <p>Any failure while opening, parsing, or closing the file is reported on
	 * standard error together with the offending path; the program does not
	 * rethrow it.
	 *
	 * @param args optional; {@code args[0]} is the path of the file to parse
	 * @throws IOException kept in the signature for compatibility with existing callers
	 */
	public static void main(String[] args) throws IOException{
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;

		// try-with-resources closes the stream on every exit path.
		try (InputStream is = new FileInputStream(path)) {
			// -1 disables BodyContentHandler's default write limit, so large
			// documents are extracted in full instead of aborting the parse.
			ContentHandler contenthandler = new BodyContentHandler(-1);
			Metadata metadata = new Metadata();
			Parser parser = new AutoDetectParser();
			parser.parse(is, contenthandler, metadata, new ParseContext());
			System.out.println(contenthandler.toString());
		}
		catch (Exception e) {
			// Top-level boundary of a CLI tool: report what failed and why.
			System.err.println("Failed to extract text from " + path);
			e.printStackTrace();
		}
	}

}
